{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "4753b7fa-091f-477a-840f-61f9c7a493c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "from sklearn.model_selection import GridSearchCV, train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.metrics import precision_recall_fscore_support\n",
    "from tqdm import tqdm  # For progress bar"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bfe4e955-4283-4621-ac2b-c37b3d099682",
   "metadata": {},
   "source": [
    "### 0) Load the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "dd9d945e-4a50-456c-895a-9df1d60fa8f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "#load the data\n",
    "with open('./data/S3/combined_train_data.csv', 'r', encoding='utf-8', errors='replace') as file:\n",
    "    df = pd.read_csv(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "a8b29152-e568-4e16-b3c6-7bee54ef1dcb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the data into features (X) and target (y)\n",
    "X = df['text']  # Use 'text' for training\n",
    "y = df['poli']  # Binary target for political relevance"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "588a7515-0392-4e3a-937f-c842ecb382fb",
   "metadata": {},
   "source": [
    "### 1) Finding the best model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "6e502f5c-0e8e-4395-8a61-5849aacffb15",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Cross-Validation Progress: 100%|██████████| 5/5 [00:29<00:00,  5.91s/it]\n",
      "Cross-Validation Progress: 100%|██████████| 5/5 [00:21<00:00,  4.31s/it]\n",
      "Cross-Validation Progress: 100%|██████████| 5/5 [05:04<00:00, 60.94s/it]\n",
      "Cross-Validation Progress: 100%|██████████| 5/5 [11:03<00:00, 132.62s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Logit + Count (5-fold CV): Mean Precision=0.866, Std=0.000, Mean Recall=0.718, Std=0.000, Mean F1=0.765, Std=0.000\n",
      "Logit + TFIDF (5-fold CV): Mean Precision=0.953, Std=0.000, Mean Recall=0.554, Std=0.000, Mean F1=0.587, Std=0.000\n",
      "XGBoost + Count (5-fold CV): Mean Precision=0.839, Std=0.000, Mean Recall=0.740, Std=0.000, Mean F1=0.778, Std=0.000\n",
      "XGBoost + TFIDF (5-fold CV): Mean Precision=0.851, Std=0.000, Mean Recall=0.735, Std=0.000, Mean F1=0.776, Std=0.000\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from xgboost import XGBClassifier\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Assuming 'df' is already defined and loaded with data\n",
    "# Split the data into features (X) and target (y)\n",
    "X = df['text']  # Use 'text' for training\n",
    "y = df['poli']  # Binary target for political relevance\n",
    "\n",
    "# Vectorizers\n",
    "count_vectorizer = CountVectorizer()\n",
    "tfidf_vectorizer = TfidfVectorizer()\n",
    "\n",
    "# Transform the text data into vectors\n",
    "X_count = count_vectorizer.fit_transform(X)\n",
    "X_tfidf = tfidf_vectorizer.fit_transform(X)\n",
    "\n",
    "# Logistic Regression model\n",
    "logit_count = LogisticRegression(random_state=42)\n",
    "logit_tfidf = LogisticRegression(random_state=42)\n",
    "\n",
    "# XGBoost model\n",
    "xgb_count = XGBClassifier(random_state=42, learning_rate=0.5, use_label_encoder=False)\n",
    "xgb_tfidf = XGBClassifier(random_state=42, learning_rate=0.5, use_label_encoder=False)\n",
    "\n",
    "# Train models\n",
    "logit_count.fit(X_count, y)\n",
    "logit_tfidf.fit(X_tfidf, y)\n",
    "xgb_count.fit(X_count, y)\n",
    "xgb_tfidf.fit(X_tfidf, y)\n",
    "\n",
    "# Function to evaluate models using cross-validation with progress bar\n",
    "def evaluate_model_cv_with_progress(model, X, y, cv=5):\n",
    "    precision_scores = []\n",
    "    recall_scores = []\n",
    "    f1_scores = []\n",
    "    \n",
    "    # Use tqdm to display progress\n",
    "    for fold in tqdm(range(cv), desc=\"Cross-Validation Progress\"):\n",
    "        precision = cross_val_score(model, X, y, cv=cv, scoring='precision_macro')\n",
    "        recall = cross_val_score(model, X, y, cv=cv, scoring='recall_macro')\n",
    "        f1 = cross_val_score(model, X, y, cv=cv, scoring='f1_macro')\n",
    "        \n",
    "        precision_scores.append(precision.mean())\n",
    "        recall_scores.append(recall.mean())\n",
    "        f1_scores.append(f1.mean())\n",
    "        \n",
    "    # Return mean and std deviation of scores\n",
    "    return (sum(precision_scores) / cv, pd.Series(precision_scores).std()), \\\n",
    "           (sum(recall_scores) / cv, pd.Series(recall_scores).std()), \\\n",
    "           (sum(f1_scores) / cv, pd.Series(f1_scores).std())\n",
    "\n",
    "# 5-fold CV for Logistic Regression with progress bars\n",
    "precision_logit_count, recall_logit_count, f1_logit_count = evaluate_model_cv_with_progress(logit_count, X_count, y)\n",
    "precision_logit_tfidf, recall_logit_tfidf, f1_logit_tfidf = evaluate_model_cv_with_progress(logit_tfidf, X_tfidf, y)\n",
    "\n",
    "# 5-fold CV for XGBoost with progress bars\n",
    "precision_xgb_count, recall_xgb_count, f1_xgb_count = evaluate_model_cv_with_progress(xgb_count, X_count, y)\n",
    "precision_xgb_tfidf, recall_xgb_tfidf, f1_xgb_tfidf = evaluate_model_cv_with_progress(xgb_tfidf, X_tfidf, y)\n",
    "\n",
    "# Output results\n",
    "print(f\"Logit + Count (5-fold CV): Mean Precision={precision_logit_count[0]:.3f}, Std={precision_logit_count[1]:.3f}, \"\n",
    "      f\"Mean Recall={recall_logit_count[0]:.3f}, Std={recall_logit_count[1]:.3f}, \"\n",
    "      f\"Mean F1={f1_logit_count[0]:.3f}, Std={f1_logit_count[1]:.3f}\")\n",
    "\n",
    "print(f\"Logit + TFIDF (5-fold CV): Mean Precision={precision_logit_tfidf[0]:.3f}, Std={precision_logit_tfidf[1]:.3f}, \"\n",
    "      f\"Mean Recall={recall_logit_tfidf[0]:.3f}, Std={recall_logit_tfidf[1]:.3f}, \"\n",
    "      f\"Mean F1={f1_logit_tfidf[0]:.3f}, Std={f1_logit_tfidf[1]:.3f}\")\n",
    "\n",
    "print(f\"XGBoost + Count (5-fold CV): Mean Precision={precision_xgb_count[0]:.3f}, Std={precision_xgb_count[1]:.3f}, \"\n",
    "      f\"Mean Recall={recall_xgb_count[0]:.3f}, Std={recall_xgb_count[1]:.3f}, \"\n",
    "      f\"Mean F1={f1_xgb_count[0]:.3f}, Std={f1_xgb_count[1]:.3f}\")\n",
    "\n",
    "print(f\"XGBoost + TFIDF (5-fold CV): Mean Precision={precision_xgb_tfidf[0]:.3f}, Std={precision_xgb_tfidf[1]:.3f}, \"\n",
    "      f\"Mean Recall={recall_xgb_tfidf[0]:.3f}, Std={recall_xgb_tfidf[1]:.3f}, \"\n",
    "      f\"Mean F1={f1_xgb_tfidf[0]:.3f}, Std={f1_xgb_tfidf[1]:.3f}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2ea16598-d9b6-4d80-9bcc-ffec02faf42c",
   "metadata": {},
   "source": [
    "### 2) machine label unlabelled data with the chosen model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3f31d447-8de7-4708-be56-fe2e1e065a84",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('./data/S3/combined_train_data.csv', 'r', encoding='utf-8', errors='replace') as file:\n",
    "    df_train = pd.read_csv(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "8aca3473-d6e5-42fd-9c0c-7b3d0160088c",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('./data/S3/jch_unlabelled.csv', 'r', encoding='utf-8', errors='replace') as file:\n",
    "    df_unlabelled = pd.read_csv(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "bc0c40da-87a4-4c09-a89e-2b2fe110117e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Predicting Unlabelled Data: 100%|██████████| 45478/45478 [23:02<00:00, 32.90it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Predictions for unlabelled data:\n",
      "                                                text  predicted_poli\n",
      "0  LIT DAILY STREAMER. RIZZ GOD HAS ARRIVED. LIT ...               0\n",
      "1  Big man. No plan yet. Youtube night. Actually ...               0\n",
      "2  NEW BIT ALERTS ARE LIVE COME GIVE EM A TRY . l...               0\n",
      "3  �� SPEED DATING VIEWERS �� {FACECAM} ft. TheNi...               0\n",
      "4  Sleep Stream W/ Julia & Codee | !Scare in chat...               0\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from xgboost import XGBClassifier\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Split the labeled data into features (X) and target (y)\n",
    "X_train = df_train['text']  # Use 'text' for training\n",
    "y_train = df_train['poli']  # Binary target for political relevance\n",
    "\n",
    "# Load your unlabelled data\n",
    "X_unlabelled = df_unlabelled['text']  # Use the relevant column for text\n",
    "\n",
    "# Vectorization using TF-IDF\n",
    "tfidf_vectorizer = TfidfVectorizer()\n",
    "\n",
    "# Fit the vectorizer on the training data and transform both the training and unlabelled data\n",
    "X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
    "X_unlabelled_tfidf = tfidf_vectorizer.transform(X_unlabelled)\n",
    "\n",
    "# Initialize and train the XGBoost model\n",
    "xgb_model = XGBClassifier(random_state=42, learning_rate=0.5, use_label_encoder=False)\n",
    "\n",
    "# Fit the model on the training data\n",
    "xgb_model.fit(X_train_tfidf, y_train)\n",
    "\n",
    "# Initialize an empty list to store predictions\n",
    "predictions = []\n",
    "\n",
    "# Use tqdm to show a progress bar while predicting\n",
    "for i in tqdm(range(len(X_unlabelled)), desc=\"Predicting Unlabelled Data\"):\n",
    "    # Predict for each instance\n",
    "    pred = xgb_model.predict(X_unlabelled_tfidf[i:i + 1])  # Predict for a single instance\n",
    "    predictions.append(pred[0])  # Append the predicted label\n",
    "\n",
    "# Add predictions to the unlabelled data DataFrame\n",
    "df_unlabelled['poli'] = predictions\n",
    "\n",
    "# Save the predictions to a new CSV file\n",
    "#df_unlabelled.to_csv('./data/S3/unlabelled_predictions.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7d67fce0-395c-48b2-b53f-f5c311af5bab",
   "metadata": {},
   "source": [
    "### 3) Combine it with the already labelled data and save the result "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "id": "dad6b404-9057-4274-bc0d-e8dfdbbcdf44",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('./data/S3/unlabelled_predictions.csv', 'r', encoding='utf-8', errors='replace') as file:\n",
    "    ml_labelled = pd.read_csv(file)\n",
    "\n",
    "with open('./data/S3/combined_train_data.csv', 'r', encoding='utf-8', errors='replace') as file:\n",
    "    hand_labelled = pd.read_csv(file)\n",
    "\n",
    "\n",
    "combined_df = pd.concat([ml_labelled, hand_labelled], ignore_index=True)\n",
    "\n",
    "#filter out only poltiical accounts\n",
    "poli_df=combined_df[combined_df[\"poli\"] == 1]\n",
    "\n",
    "# Keep only the unique observations based on the 'userlogin' column since there could be duplicates (whomever streamed with two or more game-names)\n",
    "poli_df = poli_df.drop_duplicates(subset='userlogin')\n",
    "\n",
    "poli_df.to_csv('./data/S3/final_political_streamers.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
